Shu Xu (shuxu3@illinois.edu): Part I&III
Yan Han (yanhan4@illinois.edu): Part II
Amrit Kumar (amritk2@illinois.edu): Part I
We finished this notebook together.
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from scipy.interpolate import splev, interp1d
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from skmisc.loess import loess
Write your own function to employ LOO-CV and GCV in selecting the optimal span for LOESS. Definitions of LOO-CV and GCV can be found on page 33 of [lec_W5_NonlinearRegression.pdf]
The true curve is $$ f(x) = \frac{\sin[12(x + 0.2)]}{x + 0.2} $$
Note: Your plot may differ from the one provided below. You’re encouraged to use your own color schemes and styles.
Computing the Diagonal of the Smoother Matrix:
Create a function to retrieve the diagonal of the smoother matrix. We’re only interested in the diagonal entries (which will be used in computing LOO-CV and GCV), so this function should return an n-by-1 vector.
# Fetch the LOESS demo dataset (columns: x, y) from the course site.
data_url = "https://liangfgithub.github.io/Data/Coding3_Data.csv"
data = pd.read_csv(data_url)
x, y = data['x'].values, data['y'].values
def retrieve_diagnoal_of_smooth_matrix(x: np.ndarray,
                                       span: float) -> np.ndarray:
    """Return the diagonal of the LOESS smoother matrix S as an (n,) vector.

    LOESS is a linear smoother, so smoothing the i-th standard basis
    vector e_i and reading off the i-th fitted value yields
    (S e_i)_i = S[i, i].  Only the diagonal is needed for LOO-CV / GCV.
    """
    n = len(x)
    basis_vectors = np.identity(n)
    diag = np.empty(n)
    for i in range(n):
        fitted = loess(x, basis_vectors[i, :], span=span).predict(x).values
        diag[i] = fitted[i]
    return diag
Span Value Iteration
# Function to perform LOO-CV/Generalized CV for a given span
def cross_validation(x: np.ndarray,
                     y: np.ndarray,
                     span: float) -> tuple[float, float]:
    """Compute the LOO-CV and GCV scores of a LOESS fit with this span.

    LOO-CV = mean_i [ (y_i - g(x_i)) / (1 - S_ii) ]^2
    GCV    = mean_i [ (y_i - g(x_i)) / (1 - tr(S)/n) ]^2

    Returns (loo_cv, gcv); smaller is better.
    """
    n = len(x)
    g_hat = loess(x, y, span=span).predict(x).values
    residuals = y - g_hat
    s_diag = retrieve_diagnoal_of_smooth_matrix(x, span)
    trace_s = np.sum(s_diag)
    # Vectorized forms of the per-observation accumulation loops.
    loo_cv = np.mean((residuals / (1.0 - s_diag)) ** 2)
    gcv = np.mean((residuals / (1.0 - trace_s / n)) ** 2)
    return loo_cv, gcv
# Score every candidate span on both criteria.
span_values = np.arange(0.20, 0.95, 0.05)
scores = [cross_validation(x, y, s) for s in span_values]
loocv_values = [score[0] for score in scores]
gcv_values = [score[1] for score in scores]

result_df = pd.DataFrame({'Span': span_values,
                          'loocv': loocv_values,
                          'gcv': gcv_values})
print(result_df)

# Find the optimal span based on CV and GCV
optimal_span_loocv = span_values[np.argmin(loocv_values)]
optimal_span_gcv = span_values[np.argmin(gcv_values)]
print()
print("Optimal Span of LOOCV is: {0:1.2f}".format(optimal_span_loocv))
print("Optimal Span of GCV is: {0:1.2f}".format(optimal_span_gcv))
print("The optimal span overall is {0: 1.2f}".format(optimal_span_gcv))
Span loocv gcv 0 0.20 12.415911 2.110162 1 0.25 2.241473 1.489206 2 0.30 1.502980 1.190110 3 0.35 1.259175 1.174423 4 0.40 1.190380 1.102540 5 0.45 1.156812 1.062503 6 0.50 1.125652 1.041833 7 0.55 1.179664 1.118841 8 0.60 1.179464 1.119269 9 0.65 1.250914 1.180585 10 0.70 1.553562 1.519091 11 0.75 1.636175 1.627429 12 0.80 1.764534 1.744549 13 0.85 1.976094 1.925696 14 0.90 2.035108 1.979820 Optimal Span of LOOCV is: 0.50 Optimal Span of GCV is: 0.50 The optimal span overall is 0.50
# Evaluate the true curve and the selected (span = 0.5) LOESS fit on a grid.
new_x = np.linspace(min(x), max(x), 100)
true_y = np.sin(12 * (new_x + 0.2)) / (new_x + 0.2)
fitted_y = loess(x, y, span=0.5).predict(new_x).values

fig = go.Figure()
fig.update_layout(width=1000, height=500,
                  xaxis_title="x", yaxis_title="Y")
fig.add_trace(go.Scatter(x=x, y=y,
                         mode='markers',
                         name='Raw Data',
                         marker_symbol='circle-open',
                         marker_size=10,
                         marker_color="red"))
fig.add_trace(go.Scatter(x=new_x, y=true_y,
                         name='True curve',
                         line=dict(color='grey')))
fig.add_trace(go.Scatter(x=new_x, y=fitted_y,
                         name='Fitted curve',
                         line=dict(color='blue', dash='dash')))
# converted from R's ns()
def ns(x, df=None, knots=None, boundary_knots=None, include_intercept=False):
    """Natural cubic spline basis, a port of R's splines::ns().

    Supply either `df` (interior knots are then placed at equally spaced
    quantiles of x) or an explicit `knots` sequence.  Returns a basis
    matrix with the natural-spline constraints (zero second derivative at
    both boundary knots) imposed via a QR projection, as in R.
    """
    degree = 3  # cubic B-splines
    if boundary_knots is None:
        boundary_knots = [np.min(x), np.max(x)]
    else:
        boundary_knots = np.sort(boundary_knots).tolist()
    # Flag points lying outside the boundary knots; these are handled by
    # linear extrapolation below, as R's ns() does.
    oleft = x < boundary_knots[0]
    oright = x > boundary_knots[1]
    outside = oleft | oright
    inside = ~outside
    if df is not None:
        # R convention: df = 1 + (#interior knots) + intercept.
        nIknots = df - 1 - include_intercept
        if nIknots < 0:
            nIknots = 0
        if nIknots > 0:
            # Interior knots at equally spaced quantiles of the in-range data.
            knots = np.linspace(0, 1, num=nIknots + 2)[1:-1]
            knots = np.quantile(x[~outside], knots)
    # Full knot sequence: each boundary knot repeated degree + 1 = 4 times.
    # NOTE(review): if df yields nIknots == 0 and knots is still None, this
    # concatenation raises — callers in this file always have nIknots > 0.
    Aknots = np.sort(np.concatenate((boundary_knots * 4, knots)))
    n_bases = len(Aknots) - (degree + 1)
    if any(outside):
        basis = np.empty((x.shape[0], n_bases), dtype=float)
        # NOTE(review): `e` is unused in this port (vestige of the R code).
        e = 1 / 4  # in theory anything in (0, 1); was (implicitly) 0 in R <= 3.2.2
        if any(oleft):
            # Left of the boundary: first-order Taylor expansion of each
            # basis function around the left boundary knot (value + slope).
            k_pivot = boundary_knots[0]
            xl = x[oleft] - k_pivot
            xl = np.c_[np.ones(xl.shape[0]), xl]
            # equivalent to splineDesign(Aknots, rep(k.pivot, ord), ord, derivs)
            tt = np.empty((xl.shape[1], n_bases), dtype=float)
            for j in range(xl.shape[1]):
                for i in range(n_bases):
                    # Evaluate the i-th B-spline basis (unit coefficient
                    # vector) and its j-th derivative at the pivot.
                    coefs = np.zeros((n_bases,))
                    coefs[i] = 1
                    tt[j, i] = splev(k_pivot, (Aknots, coefs, degree), der=j)
            basis[oleft, :] = xl @ tt
        if any(oright):
            # Same Taylor-expansion trick at the right boundary knot.
            k_pivot = boundary_knots[1]
            xr = x[oright] - k_pivot
            xr = np.c_[np.ones(xr.shape[0]), xr]
            tt = np.empty((xr.shape[1], n_bases), dtype=float)
            for j in range(xr.shape[1]):
                for i in range(n_bases):
                    coefs = np.zeros((n_bases,))
                    coefs[i] = 1
                    tt[j, i] = splev(k_pivot, (Aknots, coefs, degree), der=j)
            basis[oright, :] = xr @ tt
        if any(inside):
            # In-range points: ordinary B-spline evaluation, one basis
            # function at a time via a unit coefficient vector.
            xi = x[inside]
            tt = np.empty((len(xi), n_bases), dtype=float)
            for i in range(n_bases):
                coefs = np.zeros((n_bases,))
                coefs[i] = 1
                tt[:, i] = splev(xi, (Aknots, coefs, degree))
            basis[inside, :] = tt
    else:
        # All points inside the boundary: evaluate the basis directly.
        basis = np.empty((x.shape[0], n_bases), dtype=float)
        for i in range(n_bases):
            coefs = np.zeros((n_bases,))
            coefs[i] = 1
            basis[:, i] = splev(x, (Aknots, coefs, degree))
    # Second derivative of every basis function at the two boundary knots;
    # natural splines require these to vanish.
    const = np.empty((2, n_bases), dtype=float)
    for i in range(n_bases):
        coefs = np.zeros((n_bases,))
        coefs[i] = 1
        const[:, i] = splev(boundary_knots, (Aknots, coefs, degree), der=2)
    if include_intercept is False:
        basis = basis[:, 1:]
        const = const[:, 1:]
    # Impose the two constraints by rotating with the complete QR of
    # const.T and dropping the first two (constrained) columns — mirrors
    # R's qr.qty(...)[ , -(1:2)].
    qr_const = np.linalg.qr(const.T, mode='complete')[0]
    basis = (qr_const.T @ basis.T).T[:, 2:]
    return basis
def ns_predict(new, x, df=None, knots=None, boundary_knots=None, include_intercept=False):
    """Evaluate the natural-spline basis trained on `x` at new points `new`.

    Mirrors R's predict() on an ns() object: the boundary knots and the
    interior knots are derived from the *training* data `x`, then the
    basis is evaluated at `new`.
    """
    if boundary_knots is None:
        boundary_knots = [np.min(x), np.max(x)]
    else:
        boundary_knots = np.sort(boundary_knots).tolist()
    if df is not None:
        nIknots = df - 1 - include_intercept
        if nIknots < 0:
            nIknots = 0
        if nIknots > 0:
            probs = np.linspace(0, 1, num=nIknots + 2)[1:-1]
            knots = np.quantile(x, probs)
        else:
            knots = np.array([])
    # BUG FIX: call ns() with df=None so it uses the training-derived knots
    # above.  Previously df was forwarded, and ns() silently recomputed the
    # interior knots from the quantiles of `new`, giving a basis
    # inconsistent with the one fitted on the training data.
    basis = ns(new, df=None, knots=knots, boundary_knots=boundary_knots,
               include_intercept=include_intercept)
    return basis
# Weekly sales: one row per product; keep only the 52 weekly count columns.
data_df = (pd.read_csv('Sales_Transactions_Dataset_Weekly.csv')
             .set_index('Product_Code')
             .iloc[:, :52])
data_df.head()
| W0 | W1 | W2 | W3 | W4 | W5 | W6 | W7 | W8 | W9 | ... | W42 | W43 | W44 | W45 | W46 | W47 | W48 | W49 | W50 | W51 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Product_Code | |||||||||||||||||||||
| P1 | 11 | 12 | 10 | 8 | 13 | 12 | 14 | 21 | 6 | 14 | ... | 4 | 7 | 8 | 10 | 12 | 3 | 7 | 6 | 5 | 10 |
| P2 | 7 | 6 | 3 | 2 | 7 | 1 | 6 | 3 | 3 | 3 | ... | 2 | 4 | 5 | 1 | 1 | 4 | 5 | 1 | 6 | 0 |
| P3 | 7 | 11 | 8 | 9 | 10 | 8 | 7 | 13 | 12 | 6 | ... | 6 | 14 | 5 | 5 | 7 | 8 | 14 | 8 | 8 | 7 |
| P4 | 12 | 8 | 13 | 5 | 9 | 6 | 9 | 13 | 13 | 11 | ... | 9 | 10 | 3 | 4 | 6 | 8 | 14 | 8 | 7 | 8 |
| P5 | 8 | 5 | 13 | 11 | 6 | 7 | 9 | 14 | 9 | 9 | ... | 7 | 11 | 7 | 12 | 6 | 6 | 5 | 11 | 8 | 9 |
5 rows × 52 columns
# Use float64: float16 carries only ~3 significant decimal digits and is a
# poor choice for the centering and least-squares solve below.
X = data_df.to_numpy(dtype=np.float64)
X = X - np.mean(X, axis=1, keepdims=True)  # center each product's series

np.random.seed(722)
n_products = X.shape[0]
n_weeks = X.shape[1]
df = 10

# Natural-spline basis over the 52 weeks (df - 1 columns, no intercept),
# centered column-wise so it matches the row-centered X.
x = np.arange(1, n_weeks + 1)
F = ns(x, df=df - 1)
F = F - np.mean(F, axis=0, keepdims=True)

# Per-product least-squares coefficients B (one row per product):
# solve (F'F) B' = F' X' instead of forming an explicit inverse,
# which is more numerically stable.
B = np.linalg.solve(F.T @ F, F.T @ X.T).T
n_clusters = 6
n_rows, n_cols = 2, 3

# Cluster products in spline-coefficient space.
B_clusters = KMeans(n_clusters=n_clusters).fit_predict(B)

plt.figure(figsize=(10, 6))
for cluster_id in range(n_clusters):
    members = B_clusters == cluster_id
    plt.subplot(n_rows, n_cols, cluster_id + 1)
    plt.plot(X[members].T, color='gray')
    # Cluster-center curve: spline basis times the mean coefficient vector.
    plt.plot(F @ np.mean(B[members], axis=0), color='red')
    plt.xlabel('Weeks')
    plt.ylabel('Weekly Sales')
plt.tight_layout()
# For comparison, cluster directly on the centered weekly time series.
X_clusters = KMeans(n_clusters=n_clusters).fit_predict(X)

plt.figure(figsize=(10, 6))
for cluster_id in range(n_clusters):
    cluster_series = X[X_clusters == cluster_id]
    plt.subplot(n_rows, n_cols, cluster_id + 1)
    plt.plot(cluster_series.T, color='gray')
    plt.plot(cluster_series.mean(axis=0), color='red')
    plt.xlabel('Weeks')
    plt.ylabel('Weekly Sales')
plt.tight_layout()
So far in our course, we’ve utilized the U-shaped bias-variance trade-off curve as a pivotal tool for model selection. This has aided us in methodologies such as ridge/lasso regression, tree pruning, and smoothing splines, among others.
A key observation is that when a model interpolates training data to the extent that the Residual Sum of Squares (RSS) equals zero, it’s typically a red flag signaling overfitting. Such models are anticipated to perform inadequately when presented with new, unseen data.
However, in modern practice, very rich models such as neural networks are trained to exactly fit (i.e., interpolate) the data. Classically, such models would be considered overfitted, and yet they often obtain high accuracy on test data. This apparent contradiction has raised questions about the mathematical foundations of machine learning and their relevance to practitioners. (Belkin et al. 2019)
In this assignment, we will use Ridgeless to illustrate the double descent phenomenon. Our setup is similar to, but not the same as, Section 8 in Hastie (2020).
Remember the dataset used in Coding 2 Part I? It consisted of 506 rows (i.e., n = 506) and 14 columns: Y, X1 through X13.
Based on this dataset, we have formed Coding3_dataH.csv, which is structured as follows:
Ridgeless least squares can be equated with principal component regression (PCR) when all principal components are employed. For our simulation study, we’ll employ the PCR version with the scale = FALSE option, implying that we’ll center each column of the design matrix from the training data without scaling.
Your task is to write a function that accepts training and test datasets and returns the training and test errors of the ridgeless estimator. For both datasets, the initial column represents the response vector Y.
You can use R/Python packages or built-in functions for PCA/SVD, but you are not allowed to use packages or functions tailored for linear regression, PCR, or ridge regression.
Post PCA/SVD, you’ll notice that the updated design matrix comprises orthogonal columns. This allows for the calculation of least squares coefficients through simple matrix multiplication, eliminating the need for matrix inversion.
For computation stability, you need to exclude directions with extremely small eigenvalues (in PCA) or singular values (in SVD). As a reference, consider setting eps = 1e-10 as the threshold for singular values.
Although training errors aren’t a requisite for our simulation, I recommend including them in the ridgeless output. This serves as a useful debugging tool. Ideally, your training error should align with the RSS derived from a standard linear regression model.
def ridgeless_function(training_data: np.ndarray, testing_data: np.ndarray) -> float:
    """Ridgeless (minimum-norm) least squares via centered SVD / PCA.

    The first column of each dataset is the response Y; the remaining
    columns form the design matrix.  The training design is centered
    (scale=False) and decomposed with SVD; directions with singular value
    <= 1e-10 are dropped for numerical stability.  Because the principal
    component scores are orthogonal, the least-squares coefficients are
    obtained by simple projection — no matrix inversion and no regression
    package.  (Also fixes the original's redundant double transform/mask
    computation and removes the sklearn pipeline dependency.)

    Returns:
        Log of the mean squared error on the test data.
    """
    X_train, Y_train = training_data[:, 1:], training_data[:, 0]
    X_test, Y_test = testing_data[:, 1:], testing_data[:, 0]

    # Center with the *training* column means only (scale = FALSE).
    center = X_train.mean(axis=0)
    Xc_train = X_train - center

    # Principal directions; discard near-zero singular values (eps = 1e-10).
    _, sing_vals, Vt = np.linalg.svd(Xc_train, full_matrices=False)
    directions = Vt[sing_vals > 1e-10]

    # Orthogonal PC scores: coefficients come from projection, not inversion.
    Z_train = Xc_train @ directions.T
    coefs = Y_train @ Z_train / np.sum(Z_train ** 2, axis=0)
    intercept = Y_train.mean()  # intercept of the centered regression

    Z_test = (X_test - center) @ directions.T
    preds = Z_test @ coefs + intercept
    return np.log(np.mean((Y_test - preds) ** 2))
Execute the procedure below for T = 30 times.
In each iteration,
This will result in recording 236 test errors per iteration. These errors are the averaged mean squared errors based on the test data. One practical way to manage this data would be to maintain a matrix of dimensions 30-by-236 to house the test errors derived from this simulation study.
Graphical display: Plot the median of the test errors (collated over the 30 iterations) in log scale against the count of regression parameters, which spans from 5 to 240.
Shu_UIN = 8298  # Last 4-digits of my UIN
T = 30
N_PARAM = 236

Data = pd.read_csv("Coding3_dataH.csv", header=None)
log_test_error_array = np.zeros((T, N_PARAM))

for t in range(T):
    # Fresh 25/75 train/test split per repetition, seeded for reproducibility.
    train_t, test_t = train_test_split(Data.values, test_size=0.75,
                                       random_state=Shu_UIN + t)
    # d columns = 1 response + (d - 1) features, so features span 5..240.
    for d in range(6, 242):
        log_test_error_array[t, d - 6] = ridgeless_function(train_t[:, :d],
                                                            test_t[:, :d])

log_test_error_median_array = np.median(log_test_error_array, axis=0)
number_of_feature_array = np.linspace(5, 240, 236).astype(int)
# Median log test error vs. number of regression parameters (double descent).
fig = go.Figure()
fig.update_layout(width=1000, height=500,
                  xaxis_title="# of features",
                  yaxis_title="Log of Test Error")
fig.add_trace(go.Scatter(x=number_of_feature_array,
                         y=log_test_error_median_array,
                         mode='markers',
                         marker_color="blue"))